getwd() install.packages(“epiR”) ## Installng tidyverse to use dplyr to rename the columns install.packages(‘tidyverse’)
Food_Sec <- data.frame(read.csv("dec21pub.csv"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ezids)
library(ggplot2)
library(epiR)
## Loading required package: survival
## Package epiR 2.0.52 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
##
FS_Subset <- subset(Food_Sec, HRINTSTA == 001 & HRSUPINT == 001 & HRFS12MD != -9)
FS_Subset <- subset(FS_Subset, select = c("HRHHID", "GESTFIPS", "HRNUMHOU", "HEFAMINC", "HESP1", "PTDTRACE", "PRCITSHP", "PEMJNUM", "PEHRUSL1", "PEEDUCA", "HRFS12MD", "PRNMCHLD"))
FS_Subset <- FS_Subset %>% rename("Id" = "HRHHID", "States" = "GESTFIPS", "Family_Size" = "HRNUMHOU", "Household_Income" = "HEFAMINC", "SNAP" = "HESP1", "Ethnicity" = "PTDTRACE", "Citizenship_status" = "PRCITSHP", "Number_of_Jobs" = "PEMJNUM", "Hours_on_Jobs" = "PEHRUSL1" , "Education_Level" = "PEEDUCA" , "FoodSecurity_score" = "HRFS12MD")
## Converting the all the columns to factors as they are all ordinal(except the Id, but since it's categorical i'm converting it into a factor too)
FS_Subset[] <- lapply( FS_Subset, factor)
str(FS_Subset)
## 'data.frame': 71472 obs. of 12 variables:
## $ Id : Factor w/ 27922 levels "5185410966","8178510165",..: 16600 9378 9378 8472 8472 7861 7861 19375 19375 24604 ...
## $ States : Factor w/ 51 levels "1","2","4","5",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Family_Size : Factor w/ 14 levels "1","2","3","4",..: 1 2 2 2 2 2 2 2 2 1 ...
## $ Household_Income : Factor w/ 16 levels "1","2","3","4",..: 16 14 14 12 12 13 13 9 9 11 ...
## $ SNAP : Factor w/ 5 levels "-3","-2","-1",..: 3 3 3 5 5 3 3 5 5 3 ...
## $ Ethnicity : Factor w/ 24 levels "1","2","3","4",..: 1 1 1 1 1 1 1 2 2 1 ...
## $ Citizenship_status: Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Number_of_Jobs : Factor w/ 4 levels "-1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hours_on_Jobs : Factor w/ 88 levels "-4","-1","0",..: 67 43 2 43 43 62 43 2 2 2 ...
## $ Education_Level : Factor w/ 17 levels "-1","31","32",..: 14 15 1 14 14 10 10 7 10 5 ...
## $ FoodSecurity_score: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 2 2 1 ...
## $ PRNMCHLD : Factor w/ 12 levels "0","1","2","3",..: 1 2 1 1 1 1 1 1 1 1 ...
##Exploring the Food Security Score
levels(FS_Subset$'FoodSecurity_score') <- c( "High Food Security", "Marginal Food Security", "Low Food Security", "Very Low Food Security")
summary(FS_Subset$'FoodSecurity_score')
## High Food Security Marginal Food Security Low Food Security
## 58901 5442 4820
## Very Low Food Security
## 2309
FS_Subset$FS_Status <- FS_Subset$FoodSecurity_score
levels(FS_Subset$FS_Status) <- c( "Food Secure", "Food Secure", "Food Insecure", "Food Insecure")
##1 :High Food Security, 2 : Marginal Food Security , 3 : Low Food Security, 4 : Very Low Food Security ,-9 : No Response ## Children’s Food Security Scale variables are coded as “Not in Universe” (-1) if there were no children in the household. ## Attachment 17, on page 288, contains details about the food security score. I have only given it a cursory glace, but we will have to read it in detail.
ggplot(FS_Subset, aes(x= FoodSecurity_score, y = ..prop.., group = 1)) + geom_bar( fill = "blue") + geom_text(aes(label = round((..prop..), 2)), stat = "count", hjust = 0, colour = "black") + coord_flip() + labs(x= " ", y= "Relative Frequency ") + scale_y_continuous(limits = c(0, 1))
df <- FS_Subset %>%
group_by(FS_Status) %>% # Variable to be transformed
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
ggplot(df, aes(x = "", y = perc, fill = FS_Status)) +
geom_col() +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y")
levels(FS_Subset$'Ethnicity') <- c('White only', 'Black only', 'American Indian, Alaskan native only', 'Asian Only', 'Hawaiian', 'White-black', 'White-AI', 'White-Asian', 'White-HP', 'Black-AI', 'Black-Asian', 'Black-HP', 'AI-Asian', 'AI-HP', 'Asian-HP', 'W-B-AI', 'W-B-A', 'W-B-HP', 'W-AI-A', 'W-AI-HP', 'W-A-HP', 'B-AI-A', 'W-B-AL-A', 'W-AI-A-HP', 'Other 3 race combo', 'Other 4 and 5 race combo')
summary(FS_Subset$'Ethnicity', title = "PTDTRACE")
## White only Black only
## 57253 7173
## American Indian, Alaskan native only Asian Only
## 952 3989
## Hawaiian White-black
## 329 531
## White-AI White-Asian
## 437 380
## White-HP Black-AI
## 68 72
## Black-Asian Black-HP
## 33 4
## AI-Asian AI-HP
## 8 1
## Asian-HP W-B-AI
## 58 51
## W-B-A W-B-HP
## 16 7
## W-AI-A W-AI-HP
## 10 6
## W-A-HP B-AI-A
## 72 3
## W-B-AL-A W-AI-A-HP
## 1 18
## Other 3 race combo Other 4 and 5 race combo
## 0 0
summary(FS_Subset$'SNAP', title = "HESP1")
## -3 -2 -1 1 2
## 198 156 45644 7208 18266
levels(FS_Subset$'Citizenship_status') <- c('NATIVE, BORN IN THE UNITED STATES', 'NATIVE, BORN IN PUERTO RICO OR OTHER U.S. ISLAND AREAS', 'NATIVE, BORN ABROAD OF AMERICAN PARENT OR PARENTS', 'FOREIGN BORN, U.S. CITIZEN BY NATURALIZATION', 'FOREIGN BORN, NOT A CITIZEN OF THE UNITED STATES')
summary(FS_Subset$'Citizenship_status', title = "PRCITSHP")
## NATIVE, BORN IN THE UNITED STATES
## 62819
## NATIVE, BORN IN PUERTO RICO OR OTHER U.S. ISLAND AREAS
## 265
## NATIVE, BORN ABROAD OF AMERICAN PARENT OR PARENTS
## 523
## FOREIGN BORN, U.S. CITIZEN BY NATURALIZATION
## 4051
## FOREIGN BORN, NOT A CITIZEN OF THE UNITED STATES
## 3814
for(i in levels(FS_Subset$Ethnicity)){
print(ggplot(subset(FS_Subset, Ethnicity == i ) , aes(x= Ethnicity, color =FoodSecurity_score, fill = FoodSecurity_score)) + geom_bar( aes(y = (..count..)/sum(..count..)*100), position = "dodge") + labs(x= " ", y= "Percentage "))
}
selected <- c('White only', 'Black only', 'American Indian, Alaskan native only', 'Asian Only', 'Hawaiian')
FS_ss <- FS_Subset[FS_Subset$Ethnicity %in% selected,]
FS_ss <- droplevels(FS_ss)
chi_test <- table(FS_ss$Ethnicity, FS_ss$FS_Status)
chisq.test(chi_test)
##
## Pearson's Chi-squared test
##
## data: chi_test
## X-squared = 996.87, df = 4, p-value < 2.2e-16
for(i in levels(FS_Subset$Citizenship_status)){
print(ggplot(subset(FS_Subset, Citizenship_status == i ) , aes(x= Citizenship_status, color =FoodSecurity_score, fill = FoodSecurity_score)) + geom_bar( aes(y = (..count..)/sum(..count..)*100), position = "dodge")+ labs(x= " ", y= "Percentage "))
}
chi_test_CS <- table(FS_ss$Citizenship_status, FS_ss$FoodSecurity_score)
chisq.test(chi_test_CS)
##
## Pearson's Chi-squared test
##
## data: chi_test_CS
## X-squared = 441.47, df = 12, p-value < 2.2e-16
table(FS_Subset$SNAP)
##
## -3 -2 -1 1 2
## 198 156 45644 7208 18266
## Only would be considering the levels 1 and 2. here -1, the one with most frequency, means that the response is not in universe. Ideally, I should substitute this but due to time constraints I won't be doing that. -3 and -2 mean refuse to respond and no response.
FS_SNAP <- subset(FS_Subset, FS_Subset$SNAP != -1)
FS_SNAP <- droplevels(FS_SNAP)
levels(FS_SNAP$SNAP) <- c('Refused', 'Dont Know', 'Uses SNAP', 'No SNAP')
ggplot(FS_SNAP, aes(x = SNAP, fill = FoodSecurity_score)) + geom_bar(position = "fill")
ggplot(FS_SNAP, aes(x = SNAP, fill = FS_Status)) + geom_bar(position = "fill")
chi_test_SNAP <- table(FS_SNAP$SNAP, FS_SNAP$FS_Status)
chi_test_SNAP
##
## Food Secure Food Insecure
## Refused 193 5
## Dont Know 139 17
## Uses SNAP 4471 2737
## No SNAP 14258 4008
chisq.test(chi_test_SNAP)
##
## Pearson's Chi-squared test
##
## data: chi_test_SNAP
## X-squared = 764.1, df = 3, p-value < 2.2e-16
table(FS_SNAP$SNAP)
##
## Refused Dont Know Uses SNAP No SNAP
## 198 156 7208 18266
SNAP_Odds <- droplevels(subset(FS_SNAP, SNAP != "Refused" & SNAP != "Dont Know"))
TAB <- table(SNAP_Odds$SNAP, SNAP_Odds$FS_Status)
barplot(TAB, beside= T, legend = T)
epi.2by2(TAB, method = "cohort.count")
## Outcome + Outcome - Total Inc risk * Odds
## Exposed + 4471 2737 7208 62.0 1.63
## Exposed - 14258 4008 18266 78.1 3.56
## Total 18729 6745 25474 73.5 2.78
##
## Point estimates and 95% CIs:
## -------------------------------------------------------------------
## Inc risk ratio 0.79 (0.78, 0.81)
## Odds ratio 0.46 (0.43, 0.49)
## Attrib risk in the exposed * -16.03 (-17.30, -14.76)
## Attrib fraction in the exposed (%) -25.84 (-28.34, -23.40)
## Attrib risk in the population * -4.54 (-5.34, -3.73)
## Attrib fraction in the population (%) -6.17 (-6.68, -5.66)
## -------------------------------------------------------------------
## Uncorrected chi2 test that OR = 1: chi2(1) = 682.162 Pr>chi2 = <0.001
## Fisher exact test that OR = 1: Pr>chi2 = <0.001
## Wald confidence limits
## CI: confidence interval
## * Outcomes per 100 population units
food_ave <- subset(FS_Subset, select = c("Id", "Number_of_Jobs", "Hours_on_Jobs", "Education_Level", "FoodSecurity_score"))
head(food_ave)
## Id Number_of_Jobs Hours_on_Jobs Education_Level
## 1 404006407110031 -1 65 43
## 7 147240092351000 -1 40 44
## 8 147240092351000 -1 -1 -1
## 16 128450301231000 -1 40 43
## 17 128450301231000 -1 40 43
## 18 114580195861000 -1 60 39
## FoodSecurity_score
## 1 High Food Security
## 7 High Food Security
## 8 High Food Security
## 16 High Food Security
## 17 High Food Security
## 18 High Food Security
str(food_ave)
## 'data.frame': 71472 obs. of 5 variables:
## $ Id : Factor w/ 27922 levels "5185410966","8178510165",..: 16600 9378 9378 8472 8472 7861 7861 19375 19375 24604 ...
## $ Number_of_Jobs : Factor w/ 4 levels "-1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hours_on_Jobs : Factor w/ 88 levels "-4","-1","0",..: 67 43 2 43 43 62 43 2 2 2 ...
## $ Education_Level : Factor w/ 17 levels "-1","31","32",..: 14 15 1 14 14 10 10 7 10 5 ...
## $ FoodSecurity_score: Factor w/ 4 levels "High Food Security",..: 1 1 1 1 1 1 1 2 2 1 ...
food_ave$Number_of_Jobs = as.factor(food_ave$Number_of_Jobs)
food_ave$Education_Level = as.factor(food_ave$Education_Level)
food_ave$FoodSecurity_score = as.factor(food_ave$FoodSecurity_score)
food_ave$Hours_on_Jobs = as.numeric(food_ave$Hours_on_Jobs)
str(food_ave)
## 'data.frame': 71472 obs. of 5 variables:
## $ Id : Factor w/ 27922 levels "5185410966","8178510165",..: 16600 9378 9378 8472 8472 7861 7861 19375 19375 24604 ...
## $ Number_of_Jobs : Factor w/ 4 levels "-1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hours_on_Jobs : num 67 43 2 43 43 62 43 2 2 2 ...
## $ Education_Level : Factor w/ 17 levels "-1","31","32",..: 14 15 1 14 14 10 10 7 10 5 ...
## $ FoodSecurity_score: Factor w/ 4 levels "High Food Security",..: 1 1 1 1 1 1 1 2 2 1 ...
plot(food_ave$Number_of_Jobs)
ggplot(food_ave) +
geom_bar(aes(x = Number_of_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
ggplot(subset(food_ave, Number_of_Jobs == -1)) +
geom_bar(aes(x = Number_of_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
ggplot(subset(food_ave, Number_of_Jobs == 2)) +
geom_bar(aes(x = Number_of_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
ggplot(subset(food_ave, Number_of_Jobs == 3)) +
geom_bar(aes(x = Number_of_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
ggplot(subset(food_ave, Number_of_Jobs == 4)) +
geom_bar(aes(x = Number_of_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
Here * 1 High Food Security * 2 Marginal Food Security * 3 Low Food Security * 4 Very Low Food Security * -9 No Response
In the above graphs, people say that they have High Food Security irrespective of the number of jobs. But lets use Chi-Square test to see if they are really independent of each other
contable <- table(food_ave$Number_of_Jobs, food_ave$FoodSecurity_score)
xkabledply(contable)
| High Food Security | Marginal Food Security | Low Food Security | Very Low Food Security | |
|---|---|---|---|---|
| -1 | 57321 | 5318 | 4701 | 2244 |
| 2 | 1425 | 115 | 112 | 54 |
| 3 | 136 | 7 | 7 | 9 |
| 4 | 19 | 2 | 0 | 2 |
## Chi-Square Test
chi_test <- chisq.test(contable, correct = F)
## Warning in chisq.test(contable, correct = F): Chi-squared approximation may be
## incorrect
chi_test
##
## Pearson's Chi-squared test
##
## data: contable
## X-squared = 12.429, df = 9, p-value = 0.1902
The result gave warnings as the estimated value for some cells are very low. From the test, we see that the P-value for the Chi-square test is 0.3871 which is greater than the default value 0.05. Hence we accept the null hypothesis and hence, Number of Jobs doesn’t significantly affect the Food Security.
###Education Level
edu_freq <- as.data.frame(table(food_ave$Education_Level))
edu_freq
## Var1 Freq
## 1 -1 12558
## 2 31 144
## 3 32 251
## 4 33 446
## 5 34 912
## 6 35 1241
## 7 36 1538
## 8 37 1625
## 9 38 910
## 10 39 16004
## 11 40 9492
## 12 41 2454
## 13 42 3257
## 14 43 12871
## 15 44 5720
## 16 45 858
## 17 46 1191
plot(food_ave$Education_Level)
ggplot(food_ave) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
The response is understood as follows:
31 LESS THAN 1ST GRADE 32 1ST, 2ND, 3RD OR 4TH GRADE 33 5TH OR 6TH GRADE 34 7TH OR 8TH GRADE 35 9TH GRADE 36 10TH GRADE 37 11TH GRADE 38 12TH GRADE NO DIPLOMA 39 HIGH SCHOOL GRAD-DIPLOMA OR EQUIV (GED) 40 SOME COLLEGE BUT NO DEGREE 41 ASSOCIATE DEGREE-OCCUPATIONAL/VOCATIONAL 42 ASSOCIATE DEGREE-ACADEMIC PROGRAM 43 BACHELOR’S DEGREE (EX: BA, AB, BS) 44 MASTER’S DEGREE (EX: MA, MS, MEng, MEd, MSW) 45 PROFESSIONAL SCHOOL DEG (EX: MD, DDS, DVM) 46 DOCTORATE DEGREE (EX: PhD, EdD)
ggplot(subset(food_ave, Education_Level == -1)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 31)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 32)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 33)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 34)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 35)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 36)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 37)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 38)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 39)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 40)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 41)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 42)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 43)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 44)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 45)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
ggplot(subset(food_ave, Education_Level == 46)) +
geom_bar(aes(x = Education_Level, fill = FoodSecurity_score), position = "dodge") +
xlab("Education") + ylab("Count")
contable_edu <- table(food_ave$Education_Level, food_ave$FoodSecurity_score)
xkabledply(contable_edu)
| High Food Security | Marginal Food Security | Low Food Security | Very Low Food Security | |
|---|---|---|---|---|
| -1 | 9703 | 1238 | 1199 | 418 |
| 31 | 95 | 20 | 17 | 12 |
| 32 | 158 | 36 | 41 | 16 |
| 33 | 276 | 55 | 82 | 33 |
| 34 | 621 | 111 | 129 | 51 |
| 35 | 894 | 141 | 134 | 72 |
| 36 | 1094 | 190 | 167 | 87 |
| 37 | 1168 | 182 | 178 | 97 |
| 38 | 639 | 106 | 120 | 45 |
| 39 | 12435 | 1575 | 1328 | 666 |
| 40 | 7751 | 744 | 626 | 371 |
| 41 | 2033 | 182 | 160 | 79 |
| 42 | 2773 | 215 | 162 | 107 |
| 43 | 11860 | 477 | 351 | 183 |
| 44 | 5416 | 140 | 108 | 56 |
| 45 | 828 | 17 | 7 | 6 |
| 46 | 1157 | 13 | 11 | 10 |
## Chi-Square Test
chi_test <- chisq.test(contable_edu, correct = F)
## Warning in chisq.test(contable_edu, correct = F): Chi-squared approximation may
## be incorrect
chi_test
##
## Pearson's Chi-squared test
##
## data: contable_edu
## X-squared = 3136.8, df = 48, p-value < 2.2e-16
Education_Level is having a significant effect on the
Food Security of People
##Hours_on_Jobs
plot(food_ave$Hours_on_Jobs)
ggplot(food_ave) +
geom_bar(aes(x = Hours_on_Jobs, fill = FoodSecurity_score), position = "dodge") +
xlab("Jobs") + ylab("Count")
ggplot(food_ave, aes(Hours_on_Jobs, fill = FoodSecurity_score)) +
geom_histogram(binwidth=5)
ggplot(food_ave, aes(Hours_on_Jobs)) +
geom_freqpoly(binwidth=1)
ggplot(food_ave, aes(Hours_on_Jobs, color= FoodSecurity_score)) +
geom_freqpoly(binwidth=25)
ggplot(food_ave, aes(Hours_on_Jobs, fill = FoodSecurity_score)) +
geom_histogram(binwidth=5) +
facet_wrap(~FoodSecurity_score)
vary_hours <- nrow(food_ave[food_ave$Hours_on_Jobs == -4,])
vary_hours
## [1] 0
There are 0 whose working hours vary.
summary(food_ave$Hours_on_Jobs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 2.00 19.58 43.00 88.00
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
getmode(food_ave$Hours_on_Jobs)
## [1] 2
#ggplot(food_ave, aes(x=FoodSecurity_score, y=Hours_on_Jobs)) +
# geom_boxplot( colour=c("red","blue","violet","navy blue","black"), outlier.shape=8, outlier.size=4) +
#labs(title="Boxplot for Hours on Jobs", x="Control / Treatment", y = "Hours on Job")
h_anova <- aov(Hours_on_Jobs ~ FoodSecurity_score, data = food_ave)
h_tuskey <- TukeyHSD(h_anova)
summary(h_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## FoodSecurity_score 3 279623 93208 214.6 <2e-16 ***
## Residuals 71468 31045414 434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
h_tuskey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Hours_on_Jobs ~ FoodSecurity_score, data = food_ave)
##
## $FoodSecurity_score
## diff lwr upr
## Marginal Food Security-High Food Security -4.2140290 -4.972646 -3.4554123
## Low Food Security-High Food Security -5.6863555 -6.488530 -4.8841810
## Very Low Food Security-High Food Security -6.0702228 -7.206149 -4.9342962
## Low Food Security-Marginal Food Security -1.4723264 -2.531399 -0.4132541
## Very Low Food Security-Marginal Food Security -1.8561937 -3.186036 -0.5263518
## Very Low Food Security-Low Food Security -0.3838673 -1.739029 0.9712947
## p adj
## Marginal Food Security-High Food Security 0.0000000
## Low Food Security-High Food Security 0.0000000
## Very Low Food Security-High Food Security 0.0000000
## Low Food Security-Marginal Food Security 0.0020125
## Very Low Food Security-Marginal Food Security 0.0019070
## Very Low Food Security-Low Food Security 0.8860333
2 - 1, 3 -1, 4 -1, 3 -2, 4 -2 have significant difference in there mean. getmode(food_ave$Hours_on_Jobs)
#ggplot(food_ave, aes(x=FoodSecurity_score, y=Hours_on_Jobs)) +
# geom_boxplot( colour=c("red","blue","violet","navy blue","black"), outlier.shape=8, outlier.size=4) +
# labs(title="Boxplot for Hours on Jobs", x="Control / Treatment", y = "Hours on Job")
There are 0 whose working hours vary. summary(food_ave$Hours_on_Jobs)
8657d5728a00f036f19d3ba04f8e0d67a4b3431f
-States, Family size, and Household Income
levels(FS_Subset$'States') <- c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY' )
summary(FS_Subset$'States', title = "GESTFIPS")
## AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA
## 1207 970 1080 1258 6975 764 593 832 1208 2738 1421 1125 1304 2052 1265 884
## KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM
## 952 808 1606 564 963 1352 1762 999 1505 1099 1255 791 1007 994 1397 1242
## NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA
## 2580 1503 1143 1681 985 1247 1928 647 1028 840 1452 3946 1306 1030 1301 1404
## WV WI WY
## 1214 1092 1173
California has the highest number of respondents (6975), whereas Maine has the smallest number of respondents (564). In order to compare, I’m choosing states which has similar number of respondents. Alabama 1207 and Washington DC 1207, Florida 2738 and New York 2580, IL 2052 and PA 1928.
for(i in levels(FS_Subset$States)){
print(ggplot(subset(FS_Subset, States == i ) , aes(x= States, color =FoodSecurity_score, fill = FoodSecurity_score)) + geom_bar( aes(y = (..count..)/sum(..count..)*100), position = "dodge"))
}
#library(ggplot2)
#qqplot(food_una$States, food_una$FoodSecurity_score, plot.it = TRUE, xlab="States", ylab="Food security Score", main="QQ Plot for Food security vs States")
#hist(x=States )
<<<<<<< HEAD
#food_una <- subset(FS_Subset, select = c("HRHHID", "GESTFIPS", "HRNUMHOU", "HEFAMINC", "HRFS12MD"))
food_una <- subset(FS_Subset, select = c("Id", "States","Family_Size","Household_Income","FoodSecurity_score", "PRNMCHLD"))
head(food_una)
## Id States Family_Size Household_Income FoodSecurity_score
## 1 404006407110031 AL 1 16 High Food Security
## 7 147240092351000 AL 2 14 High Food Security
## 8 147240092351000 AL 2 14 High Food Security
## 16 128450301231000 AL 2 12 High Food Security
## 17 128450301231000 AL 2 12 High Food Security
## 18 114580195861000 AL 2 13 High Food Security
## PRNMCHLD
## 1 0
## 7 1
## 8 0
## 16 0
## 17 0
## 18 0
Reference to household income: 1 LESS THAN $5,000
2 5,000 TO 7,499
3 7,500 TO 9,999
4 10,000 TO 12,499
5 12,500 TO 14,999
6 15,000 TO 19,999
7 20,000 TO 24,999
8 25,000 TO 29,999
9 30,000 TO 34,999
10 35,000 TO 39,999
11 40,000 TO 49,999
12 50,000 TO 59,999
13 60,000 TO 74,999
14 75,000 TO 99,999
15 100,000 TO 149,999
16 150,000 OR MORE
income_t <- table(FS_Subset$Household_Income, FS_Subset$FoodSecurity_score)
chisq.test(income_t)
##
## Pearson's Chi-squared test
##
## data: income_t
## X-squared = 9512.9, df = 45, p-value < 2.2e-16
We can say that Household income is affecting food insecurity.
family_t <- table(FS_Subset$Family_Size, FS_Subset$FoodSecurity_score)
chisq.test(family_t)
## Warning in chisq.test(family_t): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: family_t
## X-squared = 1691.7, df = 39, p-value < 2.2e-16
We can say that Family size is affecting food insecurity.
boxplot(FoodSecurity_score ~ Family_Size, data = food_una, col=c("blue", "red", "green", "yellow", "purple", "orange"), main="Family size - Food security")
boxplot(FoodSecurity_score ~ Household_Income, data = food_una, col=c("blue", "red", "green", "yellow", "purple", "orange"), main="Household Income - Food security")
boxplot(FoodSecurity_score ~ States, data = food_una, col=c("blue", "red", "green", "yellow", "purple", "orange"))
As you can see from the boxplot, whenever family size bigger (more than
6 people), food insecurity is high. Also, household income has direct
effect on food security. When household income is higher than 40k, food
security score is low.
library(ggplot2)
qqplot(food_una$Household_Income, food_una$FoodSecurity_score, plot.it = TRUE, xlab="Household income", ylab="Food security Score", col=food_una$Household_Income, main="QQ Plot for Food security vs Household income")
summary(FS_Subset$Family_Size)
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 9210 21816 12753 13724 7810 3660 1344 560 288 120 88 72 13
## 14
## 14
library(ggplot2)
qqplot(food_una$Family_Size, food_una$FoodSecurity_score, plot.it = TRUE, xlab="Family size", ylab="Food security Score", col=food_una$Family_Size, main="QQ Plot for Food security vs Family")
ggplot(data=food_una, mapping = aes(x = Family_Size, y = FoodSecurity_score)) +
geom_point(size=5)+aes(color = Family_Size) +
theme_bw()
ggplot(data=food_una, mapping = aes(x = Family_Size, y = FoodSecurity_score)) +
geom_point(size=5)+aes(color = Household_Income) +
theme_bw()